Load packages, functions, paths
library(dplyr)
library(tidyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.3
library(sas7bdat) ##read SAS files
"%&%" = function(a,b) paste(a,b,sep="")
my.dir = "/Volumes/dolan-lab/hwheeler/ThePlatinumStudy/GWAS/genotypes/"
pca.dir = "/Volumes/dolan-lab/hwheeler/ThePlatinumStudy/GWAS/PCA/"
dem.dir = "/Volumes/dolan-lab/hwheeler/ThePlatinumStudy/Ototoxicity_modeling/from_Alicia_Brocht_20150416/sas-files/"
Call rate distributions
##look at distribution of SNP F_MISS (proportion of sample missing this SNP)
lmiss <- read.table(my.dir %&% "N88_Recluster_TOP_20150911_FinalReport.lmiss",header=T)
hist(lmiss$F_MISS)

##SNP count at start
dim(lmiss)[1]
## [1] 964193
##SNPs with call rates > 99%
table(lmiss$F_MISS<0.01)
##
## FALSE TRUE
## 30486 933707
##percent SNPs with call rates > 99%
table(lmiss$F_MISS<0.01)/sum(table(lmiss$F_MISS<0.01))
##
## FALSE TRUE
## 0.03161815 0.96838185
##after removing SNPs with < 99% call rates, look at sample F_MISS (proportion of missing SNPs)
imiss <- read.table(my.dir %&% "N88_Recluster_TOP_20150911_FinalReport.geno0.01.imiss",header=T)
hist(imiss$F_MISS)

##looks great, all individuals now have >99.2% call rates
newlmiss <- read.table(my.dir %&% "N88_Recluster_TOP_20150911_FinalReport.geno0.01.lmiss",header=T)
hist(newlmiss$F_MISS)

##SNP and individual count after rm low-call SNPs
dim(newlmiss)[1]
## [1] 933707
dim(imiss)[1]
## [1] 1145
Check IBD
##PLINK options: --genome --min 0.05 (only output pairs with PI_HAT > 0.05)
ibd <- read.table(my.dir %&% "N88_Recluster_TOP_20150911_FinalReport.geno0.01.LD0.3.genome",header=T)
ggplot(data=ibd,aes(x=Z0,y=Z1))+geom_point(alpha=1/4)+theme_bw()

##pull duplicates
dups <- data.frame()
for(i in 1:dim(ibd)[1]){
if(as.character(ibd$IID1[i]) == as.character(ibd$IID2[i])){
dups <- rbind(dups,ibd[i,])
}
}
dim(dups)
## [1] 46 14
##expected 48 dups, see 46
##missing: 31636 (Plate03_D09 was not in n=1145 genotype set)
##missing: 31505 (Plate07_E08 matches (pihat=1) 31513 (Plate01_H04), rather than the expected 31505 (Plate07_G05))
##pull hapmap samples
hapmap <- filter(ibd,grepl('NA',IID1))
hapmap
## FID1 IID1 FID2 IID2 RT EZ Z0 Z1
## 1 N88_Plate01_G03 NA12878 N88_Plate02_F05 NA12892 UN NA 0.0012 0.9847
## 2 N88_Plate01_G03 NA12878 N88_Plate03_G12 NA12891 UN NA 0.0012 0.9807
## 3 N88_Plate01_G03 NA12878 N88_Plate07_B03 NA12891 UN NA 0.0011 0.9809
## 4 N88_Plate03_G12 NA12891 N88_Plate07_B03 NA12891 UN NA 0.0000 0.0000
## 5 N88_Plate04_G09 NA10857 N88_Plate05_G04 NA12044 UN NA 0.0011 0.9776
## 6 N88_Plate04_G09 NA10857 N88_Plate06_H09 NA12043 UN NA 0.0012 0.9732
## 7 N88_Plate04_G09 NA10857 N88_Plate10_C09 NA12043 UN NA 0.0012 0.9732
## 8 N88_Plate06_H09 NA12043 N88_Plate10_C09 NA12043 UN NA 0.0000 0.0000
## Z2 PI_HAT PHE DST PPC RATIO
## 1 0.0140 0.5064 -1 0.904617 1 1316.333
## 2 0.0181 0.5084 -1 0.905004 1 986.000
## 3 0.0180 0.5085 -1 0.905006 1 1315.000
## 4 1.0000 1.0000 -1 0.999995 1 NA
## 5 0.0213 0.5101 -1 0.905319 1 787.600
## 6 0.0256 0.5122 -1 0.905732 1 1321.333
## 7 0.0256 0.5122 -1 0.905732 1 1321.333
## 8 1.0000 1.0000 -1 1.000000 1 NA
##parent-child relationships are as expected
##pull others (exlude expected dups and hapmap)
toExclude <- c(as.character(dups$IID1),as.character(hapmap$IID1))
a <- as.character(ibd$IID1) %in% toExclude
others <- ibd[a==FALSE,]
dim(others)
## [1] 188 14
hist(others$PI_HAT)

sortOthers<-others[order(others$PI_HAT,decreasing=TRUE),]
##Unexpected duplicates:
filter(others,PI_HAT>=0.2)
## FID1 IID1 FID2 IID2 RT EZ Z0 Z1 Z2 PI_HAT PHE
## 1 N88_Plate01_A04 31715 N88_Plate11_F01 31714 UN NA 0 0 1 1 -1
## 2 N88_Plate01_F04 32575 N88_Plate02_D06 32553 UN NA 0 0 1 1 -1
## 3 N88_Plate01_G04 30096 N88_Plate12_A05 30104 UN NA 0 0 1 1 -1
## 4 N88_Plate01_H04 31513 N88_Plate07_E08 31505 UN NA 0 0 1 1 -1
## 5 N88_Plate02_C09 31537 N88_Plate06_A11 31549 UN NA 0 0 1 1 -1
## 6 N88_Plate05_C01 31534 N88_Plate09_C12 31586 UN NA 0 0 1 1 -1
## 7 N88_Plate07_H02 32527 N88_Plate12_F12 35005 UN NA 0 0 1 1 -1
## 8 N88_Plate12_G09 33564 N88_Plate12_F11 30217 UN NA 0 0 1 1 -1
## DST PPC RATIO
## 1 1.000000 1 NA
## 2 0.999997 1 NA
## 3 0.999997 1 NA
## 4 1.000000 1 NA
## 5 1.000000 1 NA
## 6 1.000000 1 NA
## 7 0.999997 1 NA
## 8 1.000000 1 NA
write.table(filter(others,PI_HAT>=0.2),my.dir %&% "PtStudy_revealed_duplicates.txt",quote=FALSE,row.names=FALSE)
## Above are due to likely sample mix-ups, won't know which of pair the DNA belongs to,
## thus, remove both from analysis (n=16 removed b/c sample mix-up)
## could be identical twins, but 8 pairs highly unlikely (Monozygotic twins: 3.5 per 1000 births.)
##UPDATE 11/18/2015: correct sample in duplicate pairs revealed by Meghann O'Brien in "GWAS QC concerns speadsheet check EJ notes_MO.xlsx" See "Sample that was not run and needs analysis" column in red.
##make new mixuplist variable of sample that was not genotyped
FID = c("N88_Plate11_F01","N88_Plate02_D06","N88_Plate01_G04","N88_Plate07_E08","N88_Plate02_C09","N88_Plate05_C01","N88_Plate07_H02","N88_Plate12_G09")
IID = c("31714","32553","30096","31505","31537","31534","32527","33564")
mixuplist = data.frame(FID,IID)
##RERUN --genome without known dups, hapmap, or revealed duplicates
##make known dups/hapmap list
hapmaplist1 <- select(hapmap,FID1,IID1)
hapmaplist2 <- select(hapmap,FID2,IID2)
colnames(hapmaplist1) <- c("FID","IID")
colnames(hapmaplist2) <- c("FID","IID")
hapmaplist <- unique(rbind(hapmaplist1,hapmaplist2))
dupslist <- select(dups,FID1,IID1) #only choose one in set of known duplicates
colnames(dupslist) <- c("FID","IID")
#mixup <-filter(others,PI_HAT>=0.9) #old code for making list of all revealed duplicates
#mixup1 <- select(mixup,FID1,IID1)
#mixup2 <- select(mixup,FID2,IID2)
#colnames(mixup1) <- c("FID","IID")
#colnames(mixup2) <- c("FID","IID")
#mixuplist <- unique(rbind(mixup1,mixup2)) #old code ends here
hapdup <- unique(rbind(hapmaplist,dupslist,mixuplist))
dim(mixuplist)
## [1] 8 2
write.table(hapdup,file=my.dir %&% "hapmapDuplicateList.txt",row.names=FALSE,quote=FALSE)
## number left after rm hapmap & known duplicates
dim(imiss)[1]-dim(hapdup)[1]+dim(mixuplist)[1]
## [1] 1093
## number left after rm mixup duplicates
dim(imiss)[1]-dim(hapdup)[1]
## [1] 1085
##input new genome file
##PLINK options: --genome --min 0.05 (only output pairs with PI_HAT > 0.05)
ibd2 <- read.table(my.dir %&% "N88_Recluster_TOP_20150911_FinalReport.geno0.01.LD0.3.rmKnownDupsHapmap.genome",header=TRUE)
ggplot(data=ibd2,aes(x=Z0,y=Z1))+geom_point(alpha=1/4)+theme_bw()+coord_cartesian(xlim=c(-0.05,1.05),ylim=c(-0.05,1.05))

hist(ibd2$PI_HAT)

##UPDATE 11/18/2015: for single variant GWAS, only exclude one sample if PI_HAT>0.125
ibd2<-filter(ibd2,PI_HAT>0.125)
##make list of samples with audiometry
audiofile <- "/Users/heather/GitHub/ThePlatinumStudy/Ototoxicity_modeling/Datasets_from_Ryan_Cook_20150928/audiometry.sas7bdat"
audio <- read.sas7bdat(audiofile) %>% mutate(PATNO=as.character(patno))
airl<-filter(audio,C_AUDTEST==1,C_EARLR==1)
audiopatno <- airl$patno
write.table(audiopatno,file=my.dir %&% "patientsWithAudiometry.txt",quote=FALSE,row.names=FALSE,col.names=FALSE)
##of pairs with pi_hat > 0.125, keep the one with audiometry, if possible.
excludemat <- matrix(0,ncol=2,nrow=dim(ibd2)[1])
for(i in 1:dim(ibd2)[1]){
pat1 <- as.character(ibd2[i,2])
pat2 <- as.character(ibd2[i,4])
if(pat1 %in% audiopatno){
excludemat[i,] <- c(as.character(ibd2[i,3]),pat2)
}else{
excludemat[i,] <- c(as.character(ibd2[i,1]),pat1)
}
}
excludemat=unique(excludemat)
colnames(excludemat)=c("FID","IID")
write.table(excludemat,file=my.dir %&% "pihat0.125_exclusion_list.txt",quote=FALSE,row.names=FALSE)
dim(excludemat)
## [1] 4 2
allexclude <- rbind(hapdup,excludemat)
write.table(allexclude,file=my.dir %&% "hapmapDuplicate_pihat0.125_exclusion_list.txt",quote=FALSE,row.names=FALSE)
##number left after rm pi_hat>0.05
dim(imiss)[1]-dim(hapdup)[1]-dim(excludemat)[1]
## [1] 1081
##PLINK options: --genome (output all pairs, after rm PI_HAT>0.125)
ibd3 <- read.table(my.dir %&% "N88_Recluster_TOP_20150911_FinalReport.geno0.01.LD0.3.rmKnownDupsHapmap.rmPIHAT0.125.genome",header=TRUE)
ggplot(data=ibd3,aes(x=Z0,y=Z1))+geom_point(alpha=1/4)+theme_bw()+coord_cartesian(xlim=c(-0.05,1.05),ylim=c(-0.05,1.05))

hist(ibd3$PI_HAT)

table(ibd3$PI_HAT>0.05)/sum(table(ibd3$PI_HAT>0.05))
##
## FALSE TRUE
## 0.9997070614 0.0002929386
Calculate HWE statistics
hwe <- read.table(my.dir %&% "N88_Recluster_TOP_20150911_FinalReport.geno0.01.hwe",header=T)
summary(hwe$P)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2681 0.6796 0.6173 1.0000 1.0000
hist(hwe$P)

##SNPs with HWE P < 1e-6
table(hwe$P<1e-06)
##
## FALSE TRUE
## 930450 1842
##percent SNPs with HWE P < 1e-6
table(hwe$P<1e-06)/sum(table(hwe$P<1e-06))
##
## FALSE TRUE
## 0.998024224 0.001975776
Check heterozygosity, flag any outliers for removal
hetfile <- "N88_Recluster_TOP_20150911_FinalReport.geno0.01.LD0.3.rmKnownDupsHapmap.rmPIHAT0.125.het"
HET <- read.table(my.dir %&% hetfile,header=T,as.is=T)
H = (HET$N.NM.-HET$O.HOM.)/HET$N.NM.
oldpar=par(mfrow=c(1,2))
hist(H,50)
hist(HET$F,50)
abline(v=mean(HET$F)+6*sd(HET$F),col="red")
abline(v=mean(HET$F)-6*sd(HET$F),col="red")

summary(HET$F)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.204500 0.007935 0.016810 0.012120 0.022790 0.146000
par(oldpar)
sortHET<-HET[order(HET$F),]
#sortHET[1:20,]
#sortHET[(dim(sortHET)[1]-20):dim(sortHET)[1],]
outliers<-data.frame()
for(i in 1:length(sortHET$F)){
if(sortHET[i,6] > (mean(sortHET$F)+6*sd(sortHET$F))){
outliers <- rbind(outliers,sortHET[i,])
}
if(sortHET[i,6] < (mean(sortHET$F)-6*sd(sortHET$F))){
outliers <- rbind(outliers,sortHET[i,])
}
}
hetoutliers <- select(outliers,FID,IID)
dim(hetoutliers)
## [1] 11 2
allexclude2 <- rbind(allexclude,hetoutliers)
write.table(allexclude2,file=my.dir %&% "hapmapDuplicate_pihat0.125_hetOutlier_exclusion_list.txt",quote=F,col.names=F,row.names=F)
##number left after rm heterozygosity outliers
dim(imiss)[1]-dim(hapdup)[1]-dim(excludemat)[1]-dim(hetoutliers)[1]
## [1] 1070
PCA Plots with HapMap3 unrelateds
hapmappopinfo <- read.table(pca.dir %&% "pop_HM3_hg19_forPCA.fam") %>% select(V1,V3)
colnames(hapmappopinfo) <- c("pop","IID")
fam <- read.table(my.dir %&% 'N88_HM3_LDpruned.fam') %>% select(V1,V2)
colnames(fam) <- c("FID","IID")
popinfo <- left_join(fam,hapmappopinfo,by="IID")
## Warning in left_join_impl(x, y, by$x, by$y): joining factors with different
## levels, coercing to character vector
popinfo <- mutate(popinfo, pop=ifelse(grepl('T',IID),'Norway',as.character(pop))) %>% mutate(pop=ifelse(is.na(pop),'PtStudy',as.character(pop)))
table(popinfo$pop)
##
## ASN CEU Norway PtStudy YRI
## 170 111 243 827 110
pcs <- read.table(my.dir %&% "N88_HM3_LDpruned_2015-11-18.evec",skip=1)
#pcs <- read.table(my.dir %&% "N88_HM3_geno0.01_maf0.05_LDpruned.evec",skip=1)
pcdf <- data.frame(popinfo,pcs[,2:11]) %>% rename(PC1=V2,PC2=V3,PC3=V4,PC4=V5,PC5=V6,PC6=V7,PC7=V8,PC8=V9,PC9=V10,PC10=V11)
gwas <- filter(pcdf,pop=='PtStudy' | pop=='Norway') %>% mutate(PATNO=IID)
##add self-identified Hispanic info
subjqst <- read.sas7bdat(dem.dir %&% "subjqst.sas7bdat") %>% select(PATNO,C_RAASIAN,C_RAWHITE,C_HISPLAT,C_RAINDALS,C_RABLACK,C_RAHAWOPI)
dlist <- duplicated(subjqst$PATNO)
subjqst <- subjqst[dlist==FALSE,] ##rm duplicates in subjqst
gwas <- left_join(gwas,subjqst,by="PATNO")
## Warning in left_join_impl(x, y, by$x, by$y): joining factor and character
## vector, coercing into character vector
summary(gwas[,15:20])
## C_RAASIAN C_RAWHITE C_HISPLAT C_RAINDALS C_RABLACK C_RAHAWOPI
## : 6 : 4 : 9 : 11 : 7 : 9
## 0 :678 0 : 64 0 :685 0 :693 0 :709 0 :710
## 1 : 36 1 :651 1 : 26 1 : 11 1 : 3 2 : 2
## 2 : 1 2 : 2 2 : 1 2 : 6 2 : 2 NA's:349
## NA's:349 NA's:349 NA's:349 NA's:349 NA's:349
hm3 <- filter(pcdf,grepl('NA',IID))
table(gwas$pop)
##
## Norway PtStudy
## 243 827
table(hm3$pop)
##
## ASN CEU YRI
## 170 111 110
##calc proportion variance explained by each PC
eval <- scan(my.dir %&% 'N88_HM3_LDpruned_2015-11-18.eval')[1:10]
#eval <- scan(my.dir %&% 'N88_HM3_geno0.01_maf0.05_LDpruned.eval')[1:10]
round(eval/sum(eval),3)
## [1] 0.519 0.356 0.029 0.021 0.017 0.014 0.012 0.011 0.010 0.010
#pdf(file=my.dir %&% "PtStudy_PCA.pdf")
ggplot() + geom_point(data=gwas,aes(x=PC1,y=PC2,col=pop,shape=pop))+geom_point(data=hm3,aes(x=PC1,y=PC2,col=pop,shape=pop))+ theme_bw() + scale_colour_brewer(palette="Set1")

ggplot() + geom_point(data=gwas,aes(x=PC1,y=PC3,col=pop,shape=pop))+geom_point(data=hm3,aes(x=PC1,y=PC3,col=pop,shape=pop))+ theme_bw() + scale_colour_brewer(palette="Set1")

ggplot() + geom_point(data=gwas,aes(x=PC2,y=PC3,col=pop,shape=pop))+geom_point(data=hm3,aes(x=PC2,y=PC3,col=pop,shape=pop))+ theme_bw() + scale_colour_brewer(palette="Set1")

#dev.off()
##Norwegians outside of main Norway cluster
norOut<-filter(gwas,pop=="Norway",PC1<=0.013) %>% select(FID,IID,pop,PC1,PC2,PC3) %>% arrange(PC1)
write.table(norOut,pca.dir %&% "GWAS_QC_Norwegian_outliers.txt",quote=FALSE,row.names=FALSE,sep="\t")
norOut
## FID IID pop PC1 PC2 PC3
## 1 N88_Plate03_D05 TC07.R-3441B1 Norway 0.0069 -0.0067 -0.0383
## 2 N88_Plate06_B02 TC07.R-3542B1 Norway 0.0070 -0.0069 -0.0331
## 3 N88_Plate10_E03 TC07.R-3479B1 Norway 0.0082 -0.0048 -0.0336
## 4 N88_Plate12_C08 TC07.R-1208B1 Norway 0.0096 -0.0035 -0.0269
## 5 N88_Plate12_C10 TC07.R-3190B1 Norway 0.0112 -0.0025 -0.0352
## 6 N88_Plate08_F02 TC07.R-1174B1 Norway 0.0123 0.0031 0.0472
## 7 N88_Plate01_A03 TC07.R-3433B1 Norway 0.0125 -0.0010 -0.0255
## 8 N88_Plate02_A04 TC07.R-1101B1 Norway 0.0128 -0.0016 -0.0297
## 9 N88_Plate12_F01 TC07.R-3270B1 Norway 0.0129 -0.0005 -0.0278
## 10 N88_Plate11_B01 TC07.R-3380B1 Norway 0.0130 -0.0004 -0.0267
##Notes from Sophie Fossa re: these patients:
## FID IID pop PC1 PC2 PC3
## 1 N88_Plate03_D05 TC07.R-3441B1 Norway 0.0069 -0.0067 -0.0385 from Tromso
## 3 N88_Plate10_E03 TC07.R-3479B1 Norway 0.0081 -0.0047 -0.0336 from Tromso
## 6 N88_Plate08_F02 TC07.R-1174B1 Norway 0.0117 0.0040 0.0477 Greek (explains PC3)
## 7 N88_Plate01_A03 TC07.R-3433B1 Norway 0.0122 -0.0003 -0.0252 from Tromso
## 10 N88_Plate11_B01 TC07.R-3380B1 Norway 0.0127 0.0004 -0.0274 from Tromso
##Tromso is in northern Norway where Saami/Lapp ancestry is common
##Saami have ~6-13% Asian admixture: http://www.nature.com/ejhg/journal/v16/n11/full/ejhg200888a.html , http://www.nature.com/ejhg/journal/v19/n3/abs/ejhg2010179a.html
#pdf(file=my.dir %&% "PtStudy_PCA_Self-Report.pdf")
##color by C_HISPLAT (self-report Hispanic/Latino)
g<-mutate(gwas,Hispanic=ifelse(is.na(C_HISPLAT)|as.character(C_HISPLAT)=="2"|as.character(C_HISPLAT)=="","NA",as.character(C_HISPLAT))) %>% mutate(Hispanic=ifelse(Hispanic=="0","No",Hispanic)) %>% mutate(Hispanic=ifelse(Hispanic=="1","Yes",Hispanic)) %>% mutate(Hispanic=ifelse(pop=="Norway","Norway",Hispanic))
ggplot() + geom_point(data=g,aes(x=PC1,y=PC2,col=Hispanic,shape=Hispanic))+ theme_bw()

ggplot() + geom_point(data=g,aes(x=PC1,y=PC2,col=Hispanic,shape=Hispanic))+ theme_bw() + coord_cartesian(xlim=c(0.005,0.0175),ylim=c(-0.005,0.007)) +ggtitle("ZOOM")

##color by C_RAWHITE (self-report White)
g<-mutate(g,White=ifelse(is.na(C_RAWHITE)|as.character(C_RAWHITE)=="2"|as.character(C_RAWHITE)=="","NA",as.character(C_RAWHITE))) %>% mutate(White=ifelse(White=="0","No",White)) %>% mutate(White=ifelse(White=="1","Yes",White)) %>% mutate(White=ifelse(pop=="Norway","Norway",White))
ggplot() + geom_point(data=g,aes(x=PC1,y=PC2,col=White,shape=White))+ theme_bw()

ggplot() + geom_point(data=g,aes(x=PC1,y=PC2,col=White,shape=White))+ theme_bw() + coord_cartesian(xlim=c(0.005,0.0175),ylim=c(-0.005,0.007))+ggtitle("ZOOM")

##color by C_RAASIAN (self-report Asian)
g<-mutate(g,Asian=ifelse(is.na(C_RAASIAN)|as.character(C_RAASIAN)=="2"|as.character(C_RAASIAN)=="","NA",as.character(C_RAASIAN))) %>% mutate(Asian=ifelse(Asian=="0","No",Asian)) %>% mutate(Asian=ifelse(Asian=="1","Yes",Asian))%>% mutate(Asian=ifelse(pop=="Norway","Norway",Asian))
ggplot() + geom_point(data=g,aes(x=PC1,y=PC2,col=Asian,shape=Asian))+ theme_bw()

ggplot() + geom_point(data=g,aes(x=PC1,y=PC2,col=Asian,shape=Asian))+ theme_bw() + coord_cartesian(xlim=c(0.005,0.0175),ylim=c(-0.005,0.007))+ggtitle("ZOOM")

##color by C_HISPLAT or C_RAASIAN (Hispanic or Asian)
g<-mutate(g,Hisp_or_Asian=ifelse(Asian=="Yes"|Hispanic=="Yes","Yes",Hispanic))
ggplot() + geom_point(data=g,aes(x=PC1,y=PC2,col=Hisp_or_Asian,shape=Hisp_or_Asian))+ theme_bw()

ggplot() + geom_point(data=g,aes(x=PC1,y=PC2,col=Hisp_or_Asian,shape=Hisp_or_Asian))+ theme_bw() + coord_cartesian(xlim=c(0.005,0.0175),ylim=c(-0.005,0.007))+ggtitle("ZOOM")

##color by C_RAINDALS (self-report American Indian or Alaska Native)
g<-mutate(g,AmInd=ifelse(is.na(C_RAINDALS)|as.character(C_RAINDALS)=="2"|as.character(C_RAINDALS)=="","NA",as.character(C_RAINDALS))) %>% mutate(AmInd=ifelse(AmInd=="0","No",AmInd)) %>% mutate(AmInd=ifelse(AmInd=="1","Yes",AmInd))%>% mutate(AmInd=ifelse(pop=="Norway","Norway",AmInd))
ggplot() + geom_point(data=g,aes(x=PC1,y=PC2,col=AmInd,shape=AmInd))+ theme_bw()

ggplot() + geom_point(data=g,aes(x=PC1,y=PC2,col=AmInd,shape=AmInd))+ theme_bw() + coord_cartesian(xlim=c(0.005,0.0175),ylim=c(-0.005,0.007))+ggtitle("ZOOM")

##color by C_RABLACK (self-report Black or African American)
g<-mutate(g,Black=ifelse(is.na(C_RABLACK)|as.character(C_RABLACK)=="2"|as.character(C_RABLACK)=="","NA",as.character(C_RABLACK))) %>% mutate(Black=ifelse(Black=="0","No",Black)) %>% mutate(Black=ifelse(Black=="1","Yes",Black))%>% mutate(Black=ifelse(pop=="Norway","Norway",Black))
ggplot() + geom_point(data=g,aes(x=PC1,y=PC2,col=Black,shape=Black))+ theme_bw()

#dev.off()
###choose individuals from PtStudy & Norway for homogeneous GWAS###
ceu <- filter(pcdf,pop=='CEU')
uPC1 <- mean(ceu$PC1) + 10*sd(ceu$PC1)
lPC1 <- mean(ceu$PC1) - 10*sd(ceu$PC1)
uPC2 <- mean(ceu$PC2) + 10*sd(ceu$PC2)
lPC2 <- mean(ceu$PC2) - 10*sd(ceu$PC2)
ggplot() + geom_point(data=gwas,aes(x=PC1,y=PC2,col=pop,shape=pop))+geom_point(data=hm3,aes(x=PC1,y=PC2,col=pop,shape=pop))+ theme_bw() +geom_vline(xintercept=c(uPC1,lPC1)) +geom_hline(yintercept=c(uPC2,lPC2))

inclusion <- gwas[gwas$PC1 >= lPC1,]
inclusion <- inclusion[inclusion$PC1 <= uPC1,]
inclusion <- inclusion[inclusion$PC2 >= lPC2,]
inclusion <- inclusion[inclusion$PC2 <= uPC2,]
samples <- inclusion[,1:2]
table(inclusion$pop)
##
## Norway PtStudy
## 240 713
##number left after rm non-euro (CEU) clustering individuals
dim(samples)[1]
## [1] 953
##number removed in PCA analysis
dim(gwas)[1]-dim(samples)[1]
## [1] 117
excluded <- dplyr::filter(gwas,IID %in% samples$IID==FALSE)
write.table(excluded,file=my.dir %&% "N88.noneuro.10PCs.covfile",quote=FALSE,row.names=FALSE)
ggplot() + geom_point(data=gwas,aes(x=PC1,y=PC2,col=gwas$IID %in% samples$IID,shape=gwas$IID %in% samples$IID))+geom_point(data=hm3,aes(x=PC1,y=PC2,col=pop,shape=pop))+ theme_bw()

write.table(samples,file=my.dir %&% "N88.euro.GWAS.PCAs",quote=F,row.names=F,col.names=F)
##Run smartpca with euro only, see 02_plink_QC.sh
europcs <- read.table(my.dir %&% "N88_euro_LDpruned_2015-11-18.evec",skip=1)
eupcdf <- europcs %>% rename(PC1=V2,PC2=V3,PC3=V4,PC4=V5,PC5=V6,PC6=V7,PC7=V8,PC8=V9,PC9=V10,PC10=V11) %>% mutate(pop=ifelse(grepl("TC",V1),"Norway","PtStudy"))
##calc proportion variance explained by each PC
eval <- scan(my.dir %&% 'N88_euro_LDpruned_2015-11-18.eval')[1:10]
round(eval/sum(eval),3)
## [1] 0.247 0.102 0.096 0.084 0.082 0.080 0.079 0.078 0.077 0.075
ggplot() + geom_point(data=eupcdf,aes(x=PC1,y=PC2,col=pop,shape=pop)) + theme_bw()

ggplot() + geom_point(data=eupcdf,aes(x=PC1,y=PC3,col=pop,shape=pop)) + theme_bw()

ggplot() + geom_point(data=eupcdf,aes(x=PC2,y=PC3,col=pop,shape=pop)) + theme_bw()

just plot Pt Study people
###choose individuals from PtStudy & Norway for homogeneous GWAS###
ceu <- filter(pcdf,pop=='CEU')
uPC1 <- mean(ceu$PC1) + 10*sd(ceu$PC1)
lPC1 <- mean(ceu$PC1) - 10*sd(ceu$PC1)
uPC2 <- mean(ceu$PC2) + 10*sd(ceu$PC2)
lPC2 <- mean(ceu$PC2) - 10*sd(ceu$PC2)
ptgwas <- filter(gwas,pop!="Norway") %>% dplyr::select(FID,IID,pop,starts_with('PC'))
ptgwashm3 <- rbind(ptgwas,hm3) %>% mutate(population=pop)
ptgwashm3$population <- factor(ptgwashm3$population, levels = c('ASN','CEU','YRI','PtStudy'), ordered=T)
ggplot() + geom_point(data=ptgwashm3,aes(x=PC1,y=PC2,col=population,shape=population))+ theme_bw() +geom_vline(xintercept=c(lPC1),col='blue')

tiff(filename=pca.dir %&% "PtStudy_PCPlot.tiff",width=660,height=480)
ggplot() + geom_point(data=ptgwashm3,aes(x=PC1,y=PC2,col=population,shape=population))+ theme_bw(20) +geom_vline(xintercept=c(lPC1),col='blue')
dev.off()
## quartz_off_screen
## 2